In [69]:
# Initializing the environment

import pandas as pd
import numpy as np

# I dont like warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'
In [70]:
# Loading daaset

cancer = pd.read_csv("D:/MachineLearning/exercises-linear-regression-exercise-1/original/cancer_reg.csv")
household = pd.read_csv("D:/MachineLearning/exercises-linear-regression-exercise-1/original/avg-household-size.csv")

Merging the two data files on the basis of geography

In [71]:
new_cancer = pd.merge(cancer, household, on='geography')
new_cancer.columns
Out[71]:
Index(['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'geography',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate', 'statefips',
       'countyfips', 'avghouseholdsize'],
      dtype='object')
In [72]:
cancer.columns
Out[72]:
Index(['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'geography',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate'],
      dtype='object')

Lets first look at the data

In [73]:
cancer.head()
Out[73]:
avganncount avgdeathsperyear target_deathrate incidencerate medincome popest2015 povertypercent studypercap binnedinc medianage ... pctprivatecoveragealone pctempprivcoverage pctpubliccoverage pctpubliccoveragealone pctwhite pctblack pctasian pctotherrace pctmarriedhouseholds birthrate
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 (61494.5, 125635] 39.3 ... NaN 41.6 32.9 14.0 81.780529 2.594728 4.821857 1.843479 52.856076 6.118831
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 (48021.6, 51046.4] 33.0 ... 53.8 43.6 31.1 15.3 89.228509 0.969102 2.246233 3.741352 45.372500 4.333096
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 (48021.6, 51046.4] 45.0 ... 43.5 34.9 42.1 21.1 90.922190 0.739673 0.465898 2.747358 54.444868 3.729488
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 (42724.4, 45201] 42.8 ... 40.3 35.0 45.3 25.0 91.744686 0.782626 1.161359 1.362643 51.021514 4.603841
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 (48021.6, 51046.4] 48.3 ... 43.9 35.1 44.0 22.7 94.104024 0.270192 0.665830 0.492135 54.027460 6.796657

5 rows × 33 columns

In [74]:
cancer.tail()
Out[74]:
avganncount avgdeathsperyear target_deathrate incidencerate medincome popest2015 povertypercent studypercap binnedinc medianage ... pctprivatecoveragealone pctempprivcoverage pctpubliccoverage pctpubliccoveragealone pctwhite pctblack pctasian pctotherrace pctmarriedhouseholds birthrate
3042 1962.667684 15 149.6 453.549422 46961 6343 12.4 0.000000 (45201, 48021.6] 44.2 ... 54.9 44.6 31.7 13.2 90.280811 3.837754 0.327613 1.700468 51.063830 7.773512
3043 1962.667684 43 150.1 453.549422 48609 37118 18.8 377.175494 (48021.6, 51046.4] 30.4 ... 53.3 48.6 28.8 17.7 75.706245 2.326771 4.044920 14.130288 52.007937 8.186470
3044 1962.667684 46 153.9 453.549422 51144 34536 15.0 1968.959926 (51046.4, 54545.6] 30.9 ... 52.6 47.8 26.6 16.8 87.961629 2.313188 1.316472 5.680705 55.153949 7.809192
3045 1962.667684 52 175.0 453.549422 50745 25609 13.3 0.000000 (48021.6, 51046.4] 39.0 ... 56.3 49.6 29.5 14.0 92.905681 1.176562 0.244632 2.131790 58.484232 7.582938
3046 1962.667684 48 213.6 453.549422 41193 37030 13.9 0.000000 (40362.7, 42724.4] 26.2 ... 60.3 22.9 25.1 12.6 70.098132 16.590100 3.177753 1.356457 56.040242 8.981723

5 rows × 33 columns

Here, target_deathrate is the target variable that we have to predict

In [75]:
cancer.describe()
Out[75]:
avganncount avgdeathsperyear target_deathrate incidencerate medincome popest2015 povertypercent studypercap medianage medianagemale ... pctprivatecoveragealone pctempprivcoverage pctpubliccoverage pctpubliccoveragealone pctwhite pctblack pctasian pctotherrace pctmarriedhouseholds birthrate
count 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3.047000e+03 3047.000000 3047.000000 3047.000000 3047.000000 ... 2438.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000
mean 606.338544 185.965868 178.664063 448.268586 47063.281917 1.026374e+05 16.878175 155.399415 45.272333 39.570725 ... 48.453774 41.196324 36.252642 19.240072 83.645286 9.107978 1.253965 1.983523 51.243872 5.640306
std 1416.356223 504.134286 27.751511 54.560733 12040.090836 3.290592e+05 6.409087 529.628366 45.304480 5.226017 ... 10.083006 9.447687 7.841741 6.113041 16.380025 14.534538 2.610276 3.517710 6.572814 1.985816
min 6.000000 3.000000 59.700000 201.300000 22640.000000 8.270000e+02 3.200000 0.000000 22.300000 22.400000 ... 15.700000 13.500000 11.200000 2.600000 10.199155 0.000000 0.000000 0.000000 22.992490 0.000000
25% 76.000000 28.000000 161.200000 420.300000 38882.500000 1.168400e+04 12.150000 0.000000 37.700000 36.350000 ... 41.000000 34.500000 30.900000 14.850000 77.296180 0.620675 0.254199 0.295172 47.763063 4.521419
50% 171.000000 61.000000 178.100000 453.549422 45207.000000 2.664300e+04 15.900000 0.000000 41.000000 39.600000 ... 48.700000 41.100000 36.300000 18.800000 90.059774 2.247576 0.549812 0.826185 51.669941 5.381478
75% 518.000000 149.000000 195.200000 480.850000 52492.000000 6.867100e+04 20.400000 83.650776 44.000000 42.500000 ... 55.600000 47.700000 41.550000 23.100000 95.451693 10.509732 1.221037 2.177960 55.395132 6.493677
max 38150.000000 14010.000000 362.800000 1206.900000 125635.000000 1.017029e+07 47.400000 9762.308998 624.000000 64.700000 ... 78.900000 70.700000 65.100000 46.600000 100.000000 85.947799 42.619425 41.930251 78.075397 21.326165

8 rows × 31 columns

In [76]:
# regex - geography
In [77]:
cancer.geography.head(2)
Out[77]:
0      Kitsap County, Washington
1    Kittitas County, Washington
Name: geography, dtype: object
In [78]:
# new data frame with split value columns 
a = new_cancer["geography"].str.split(",", n = 1, expand = True)

new_cancer['County'] = a[0]
new_cancer['State'] = a[1]
In [79]:
?str.split
In [80]:
new_cancer.columns
Out[80]:
Index(['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'geography',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate', 'statefips',
       'countyfips', 'avghouseholdsize', 'County', 'State'],
      dtype='object')
In [81]:
cancer.head(1)
Out[81]:
avganncount avgdeathsperyear target_deathrate incidencerate medincome popest2015 povertypercent studypercap binnedinc medianage ... pctprivatecoveragealone pctempprivcoverage pctpubliccoverage pctpubliccoveragealone pctwhite pctblack pctasian pctotherrace pctmarriedhouseholds birthrate
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 (61494.5, 125635] 39.3 ... NaN 41.6 32.9 14.0 81.780529 2.594728 4.821857 1.843479 52.856076 6.118831

1 rows × 33 columns

Geomap

In [82]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode,plot, iplot
In [83]:
import cufflinks as cf
In [84]:
init_notebook_mode(connected=True)
cf.go_offline()
In [85]:
grouped = new_cancer.groupby(['State','statefips'])
new_data = grouped['target_deathrate'].agg({'target_deathrate':sum})
new_data = pd.DataFrame(new_data)
new_data = new_data.reset_index()
new_data.head(10)
Out[85]:
State statefips target_deathrate
0 Alabama 1 12141.9
1 Alaska 2 3481.5
2 Arizona 4 2236.3
3 Arkansas 5 15006.8
4 California 6 9011.5
5 Colorado 8 8493.7
6 Connecticut 9 1261.7
7 Delaware 10 536.6
8 District of Columbia 11 182.3
9 Florida 12 11842.4
In [86]:
new_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 3 columns):
State               51 non-null object
statefips           51 non-null int64
target_deathrate    51 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.3+ KB
In [87]:
states = pd.read_csv('D:/MachineLearning/states.csv')
In [88]:
states.head(3)
Out[88]:
State Abbreviation
0 Alabama AL
1 Alaska AK
2 Arizona AZ
In [89]:
states.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
State           51 non-null object
Abbreviation    51 non-null object
dtypes: object(2)
memory usage: 896.0+ bytes
In [90]:
states.shape
new_data['abbreviation'] = states['Abbreviation']
In [91]:
# Joining State and new_data 
In [92]:
new_data.to_csv("D:/MachineLearning/new_data.csv")
In [93]:
new_data = pd.read_csv("D:/MachineLearning/new_data.csv")
In [94]:
new_data.head(10)
Out[94]:
Unnamed: 0 State statefips target_deathrate abbreviation
0 0 Alabama 1 12141.9 AL
1 1 Alaska 2 3481.5 AK
2 2 Arizona 4 2236.3 AZ
3 3 Arkansas 5 15006.8 AR
4 4 California 6 9011.5 CA
5 5 Colorado 8 8493.7 CO
6 6 Connecticut 9 1261.7 CT
7 7 Delaware 10 536.6 DE
8 8 District of Columbia 11 182.3 DC
9 9 Florida 12 11842.4 FL
In [95]:
new_data.shape
Out[95]:
(51, 5)

Death-rate mapping across United States

In [96]:
data = dict(type = 'choropleth',
           colorscale = 'Portland',
           locations = new_data['abbreviation'],
           z = new_data["target_deathrate"],
           locationmode = "USA-states",
           colorbar = {'title':'Target Death Rate'})
In [97]:
layout = dict(title = "Target death rate due to cancer in the United States",
             geo = dict(scope = 'usa',
                        showlakes = True
                       ))
In [98]:
choromap = dict(data=[data],layout=layout)
In [99]:
iplot(choromap)

The above geomap represents distribution of target death rates across United States. From the graph, the gradient shows that Texas has the highest death rate due to cancer.

Univariate Analysis

Boxplots to check for outliers for the target variable

In [100]:
cancer['target_deathrate'].plot(subplots = True, kind = 'box', layout = (1,1),figsize = [7,7])
plt.show()

There are no outliers in the target variables

Lets check the distributon of the target variable, target deathrate

In [101]:
sns.distplot(cancer['target_deathrate'],color="Green")
plt.xlabel('Target DeathRate')
plt.ylabel('Count')
plt.title('Histogram of Target DeathRate')
plt.show()

Since the target variable is normally distributed, we can directly apply linear regression.

Bivariate Analysis

Multivariate plots allow us to see relationships between two and more different variables, all in one figure.

In [102]:
# Scatter Plot

cancer.plot(kind='scatter', x='incidencerate', y='target_deathrate', alpha=0.2)
plt.xlabel('Incidence Rate')
plt.ylabel('Death Rate')
plt.title('Incidence Rate vs Death Rate')
plt.show()
In [103]:
cancer.plot(kind='scatter', x='medincome', y='target_deathrate', alpha=0.2)
plt.xlabel('Median Income')
plt.ylabel('Death Rate')
plt.title('Median Income vs Death Rate')
plt.show()

There is a slight negative correlation between cancer death rate and median income.